knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(yaml)
library(here)
library(plotly)
library(scales)
library(pander)
library(kableExtra)PROJECT OVERVIEW
Objective:
Examining patterns by combining replicate CHANGE-Seq output from datasets.
Approach & Procedure:
DNA Lines: The data files for the target sites (A, E, and F; described below) from the CHANGE-Seq manuscript were retrieved from DNAnexus (given by Yichao via wget) and run through CHANGE-Seq modeled by their yaml files. All parameters were maintained. The only difference, bioinformatically, between these samples and the ones in the manuscript is the version number of CHANGE-Seq (the ones here is using an updated version - 1.2.7).
All other data were run with the same (1.2.7) version of CHANGE-Seq and by myself (Sierra D. Miller).
Sample Table
Experimental Factors
VENN TABLES
From each datasetās respective analyses, tables and data points are pooled into summary tables here.
CHANGE-Seq Manuscript: Figure 1i Samples
(fig1i_vt <- read_tsv(here("data-raw/tables/Fig1i_venntable.tsv"))) %>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site | target_site_total | REP1 | REP1-REP2 | REP2 | max_on_target_readcount | max_off_target_readcount |
|---|---|---|---|---|---|---|
| EMX1 | 983 | 358 (36%) | 241 (25%) | 384 (39%) | 18884 | 6792 |
| FANCF | 742 | 235 (32%) | 137 (18%) | 370 (50%) | 2960 | 2618 |
| HEK293-site-1 | 212 | 86 (41%) | 57 (27%) | 69 (33%) | 1840 | 328 |
| HEK293-site-2 | 1043 | 408 (39%) | 165 (16%) | 470 (45%) | 13480 | 1634 |
| HEK293-site-3 | 800 | 309 (39%) | 157 (20%) | 334 (42%) | 6296 | 3254 |
| HEK293-site-4 | 4910 | 1481 (30%) | 1710 (35%) | 1719 (35%) | 31912 | 35312 |
| RNF2 | 120 | 18 (15%) | 15 (12%) | 87 (72%) | 934 | 130 |
| VEGFA-site-1 | 2113 | 547 (26%) | 797 (38%) | 769 (36%) | 15016 | 26736 |
| VEGFA-site-2 | 4694 | 1151 (25%) | 2015 (43%) | 1528 (33%) | 904 | 15032 |
| VEGFA-site-3 | 2904 | 795 (27%) | 1014 (35%) | 1095 (38%) | 82 | 2590 |
CHANGE-Seq Manuscript GIAB DNA Lines
Samples were prepped and sequenced at St.Ā Jude and run through bioinformatics pipeline by Sierra D. Miller
Pooled target sites, grouped by DNA line:
(dna_lines_vt <- read_tsv(here("data-raw/tables/DNALines_venntable.tsv")))%>%
kbl() %>%
kable_material(c("striped", "hover"))| DNA_line | DNA_line_total | REP1 | REP1-REP2 | REP2 | max_on_target_readcount | max_off_target_readcount |
|---|---|---|---|---|---|---|
| GM12878 | 15282 | 6028 (39%) | 4970 (33%) | 4284 (28%) | 12488 | 7298 |
| GM24143 | 20024 | 8071 (40%) | 5945 (30%) | 6008 (30%) | 17948 | 13082 |
| GM24149 | 16756 | 5591 (33%) | 5420 (32%) | 5745 (34%) | 12256 | 9740 |
| GM24385 | 14218 | 4993 (35%) | 4925 (35%) | 4300 (30%) | 24354 | 17222 |
| GM24631 | 16014 | 4888 (31%) | 5190 (32%) | 5936 (37%) | 20978 | 13058 |
| GM24694 | 13027 | 3646 (28%) | 3454 (27%) | 5927 (45%) | 16758 | 10686 |
| GM24695 | 14078 | 6780 (48%) | 4327 (31%) | 2971 (21%) | 18140 | 8090 |
Pooled DNA lines, grouped by target site:
(dna_lines_pertarg_vt <- read_tsv(here("data-raw/tables/pertarget_venntable.tsv")))%>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site | target_site_total | GM12878 | GM12878-GM24143 | GM12878-GM24143-GM24149 | GM12878-GM24143-GM24149-GM24385 | GM12878-GM24143-GM24149-GM24385-GM24631 | GM12878-GM24143-GM24149-GM24385-GM24631-GM24694 | GM12878-GM24143-GM24149-GM24385-GM24631-GM24694-GM24695 | GM12878-GM24143-GM24149-GM24385-GM24631-GM24695 | GM12878-GM24143-GM24149-GM24385-GM24694 | GM12878-GM24143-GM24149-GM24385-GM24694-GM24695 | GM12878-GM24143-GM24149-GM24385-GM24695 | GM12878-GM24143-GM24149-GM24631 | GM12878-GM24143-GM24149-GM24631-GM24694 | GM12878-GM24143-GM24149-GM24631-GM24694-GM24695 | GM12878-GM24143-GM24149-GM24631-GM24695 | GM12878-GM24143-GM24149-GM24694 | GM12878-GM24143-GM24149-GM24694-GM24695 | GM12878-GM24143-GM24149-GM24695 | GM12878-GM24143-GM24385 | GM12878-GM24143-GM24385-GM24631 | GM12878-GM24143-GM24385-GM24631-GM24694 | GM12878-GM24143-GM24385-GM24631-GM24694-GM24695 | GM12878-GM24143-GM24385-GM24631-GM24695 | GM12878-GM24143-GM24385-GM24694 | GM12878-GM24143-GM24385-GM24694-GM24695 | GM12878-GM24143-GM24385-GM24695 | GM12878-GM24143-GM24631 | GM12878-GM24143-GM24631-GM24694 | GM12878-GM24143-GM24631-GM24694-GM24695 | GM12878-GM24143-GM24631-GM24695 | GM12878-GM24143-GM24694 | GM12878-GM24143-GM24694-GM24695 | GM12878-GM24143-GM24695 | GM12878-GM24149 | GM12878-GM24149-GM24385 | GM12878-GM24149-GM24385-GM24631 | GM12878-GM24149-GM24385-GM24631-GM24694 | GM12878-GM24149-GM24385-GM24631-GM24694-GM24695 | GM12878-GM24149-GM24385-GM24631-GM24695 | GM12878-GM24149-GM24385-GM24694 | GM12878-GM24149-GM24385-GM24694-GM24695 | GM12878-GM24149-GM24385-GM24695 | GM12878-GM24149-GM24631 | GM12878-GM24149-GM24631-GM24694 | GM12878-GM24149-GM24631-GM24694-GM24695 | GM12878-GM24149-GM24631-GM24695 | GM12878-GM24149-GM24694 | GM12878-GM24149-GM24694-GM24695 | GM12878-GM24149-GM24695 | GM12878-GM24385 | GM12878-GM24385-GM24631 | GM12878-GM24385-GM24631-GM24694 | GM12878-GM24385-GM24631-GM24694-GM24695 | GM12878-GM24385-GM24631-GM24695 | GM12878-GM24385-GM24694 | GM12878-GM24385-GM24694-GM24695 | GM12878-GM24385-GM24695 | GM12878-GM24631 | GM12878-GM24631-GM24694 | GM12878-GM24631-GM24694-GM24695 | GM12878-GM24631-GM24695 | GM12878-GM24694 | GM12878-GM24694-GM24695 | GM12878-GM24695 | GM24143 | GM24143-GM24149 | GM24143-GM24149-GM24385 | GM24143-GM24149-GM24385-GM24631 | GM24143-GM24149-GM24385-GM24631-GM24694 | GM24143-GM24149-GM24385-GM24631-GM24694-GM24695 | GM24143-GM24149-GM24385-GM24631-GM24695 | GM24143-GM24149-GM24385-GM24694 | GM24143-GM24149-GM24385-GM24694-GM24695 | GM24143-GM24149-GM24385-GM24695 | GM24143-GM24149-GM24631 | GM24143-GM24149-GM24631-GM24694 | GM24143-GM24149-GM24631-GM24694-GM24695 | GM24143-GM24149-GM24631-GM24695 | GM24143-GM24149-GM24694 | GM24143-GM24149-GM24694-GM24695 | GM24143-GM24149-GM24695 | GM24143-GM24385 | GM24143-GM24385-GM24631 | GM24143-GM24385-GM24631-GM24694 | GM24143-GM24385-GM24631-GM24694-GM24695 | GM24143-GM24385-GM24631-GM24695 | GM24143-GM24385-GM24694 | GM24143-GM24385-GM24694-GM24695 | GM24143-GM24385-GM24695 | GM24143-GM24631 | GM24143-GM24631-GM24694 | GM24143-GM24631-GM24694-GM24695 | GM24143-GM24631-GM24695 | GM24143-GM24694 | GM24143-GM24694-GM24695 | GM24143-GM24695 | GM24149 | GM24149-GM24385 | GM24149-GM24385-GM24631 | GM24149-GM24385-GM24631-GM24694 | GM24149-GM24385-GM24631-GM24694-GM24695 | GM24149-GM24385-GM24631-GM24695 | GM24149-GM24385-GM24694 | GM24149-GM24385-GM24694-GM24695 | GM24149-GM24385-GM24695 | GM24149-GM24631 | GM24149-GM24631-GM24694 | GM24149-GM24631-GM24694-GM24695 | GM24149-GM24631-GM24695 | GM24149-GM24694 | GM24149-GM24694-GM24695 | GM24149-GM24695 | GM24385 | GM24385-GM24631 | GM24385-GM24631-GM24694 | GM24385-GM24631-GM24694-GM24695 | GM24385-GM24631-GM24695 | GM24385-GM24694 | GM24385-GM24694-GM24695 | GM24385-GM24695 | GM24631 | GM24631-GM24694 | GM24631-GM24694-GM24695 | GM24631-GM24695 | GM24694 | GM24694-GM24695 | GM24695 | max_on_target_readcount | max_off_target_readcount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAVS1_site_14 | 8063 | 306 (4%) | 35 (0%) | 18 (0%) | 14 (0%) | 14 (0%) | 34 (0%) | 1369 (17%) | 44 (1%) | 13 (0%) | 32 (0%) | 10 (0%) | 21 (0%) | 22 (0%) | 114 (1%) | 21 (0%) | 22 (0%) | 34 (0%) | 10 (0%) | 14 (0%) | 2 (0%) | 15 (0%) | 64 (1%) | 16 (0%) | 9 (0%) | 8 (0%) | 10 (0%) | 26 (0%) | 16 (0%) | 32 (0%) | 17 (0%) | 33 (0%) | 8 (0%) | 31 (0%) | 61 (1%) | 13 (0%) | 10 (0%) | 12 (0%) | 36 (0%) | 10 (0%) | 10 (0%) | 10 (0%) | 14 (0%) | 25 (0%) | 26 (0%) | 20 (0%) | 21 (0%) | 23 (0%) | 6 (0%) | 22 (0%) | 42 (1%) | 15 (0%) | 6 (0%) | 11 (0%) | 10 (0%) | 5 (0%) | 10 (0%) | 16 (0%) | 81 (1%) | 33 (0%) | 23 (0%) | 21 (0%) | 56 (1%) | 31 (0%) | 92 (1%) | 407 (5%) | 95 (1%) | 20 (0%) | 11 (0%) | 23 (0%) | 96 (1%) | 28 (0%) | 11 (0%) | 23 (0%) | 14 (0%) | 26 (0%) | 24 (0%) | 74 (1%) | 32 (0%) | 45 (1%) | 37 (0%) | 22 (0%) | 48 (1%) | 20 (0%) | 14 (0%) | 34 (0%) | 12 (0%) | 3 (0%) | 19 (0%) | 18 (0%) | 92 (1%) | 40 (0%) | 44 (1%) | 40 (0%) | 95 (1%) | 25 (0%) | 103 (1%) | 410 (5%) | 49 (1%) | 29 (0%) | 14 (0%) | 30 (0%) | 11 (0%) | 22 (0%) | 9 (0%) | 26 (0%) | 90 (1%) | 39 (0%) | 25 (0%) | 43 (1%) | 81 (1%) | 37 (0%) | 100 (1%) | 258 (3%) | 65 (1%) | 28 (0%) | 24 (0%) | 32 (0%) | 63 (1%) | 15 (0%) | 28 (0%) | 545 (7%) | 80 (1%) | 54 (1%) | 124 (2%) | 434 (5%) | 90 (1%) | 408 (5%) | 24354 | 17222 |
| CCR5_site_3 | 18098 | 914 (5%) | 202 (1%) | 99 (1%) | 86 (0%) | 171 (1%) | 166 (1%) | 3504 (19%) | 542 (3%) | 48 (0%) | 150 (1%) | 138 (1%) | 73 (0%) | 56 (0%) | 118 (1%) | 121 (1%) | 24 (0%) | 34 (0%) | 36 (0%) | 127 (1%) | 86 (0%) | 68 (0%) | 118 (1%) | 112 (1%) | 37 (0%) | 56 (0%) | 73 (0%) | 98 (1%) | 28 (0%) | 42 (0%) | 72 (0%) | 54 (0%) | 27 (0%) | 50 (0%) | 205 (1%) | 112 (1%) | 74 (0%) | 51 (0%) | 96 (1%) | 104 (1%) | 38 (0%) | 27 (0%) | 61 (0%) | 91 (1%) | 24 (0%) | 34 (0%) | 59 (0%) | 22 (0%) | 24 (0%) | 73 (0%) | 185 (1%) | 95 (1%) | 16 (0%) | 33 (0%) | 53 (0%) | 57 (0%) | 23 (0%) | 71 (0%) | 210 (1%) | 39 (0%) | 21 (0%) | 65 (0%) | 109 (1%) | 27 (0%) | 146 (1%) | 706 (4%) | 171 (1%) | 82 (0%) | 96 (1%) | 54 (0%) | 110 (1%) | 115 (1%) | 38 (0%) | 45 (0%) | 69 (0%) | 70 (0%) | 36 (0%) | 44 (0%) | 76 (0%) | 46 (0%) | 18 (0%) | 63 (0%) | 168 (1%) | 114 (1%) | 24 (0%) | 48 (0%) | 60 (0%) | 36 (0%) | 32 (0%) | 61 (0%) | 196 (1%) | 63 (0%) | 18 (0%) | 51 (0%) | 71 (0%) | 31 (0%) | 136 (1%) | 840 (5%) | 239 (1%) | 84 (0%) | 31 (0%) | 24 (0%) | 55 (0%) | 39 (0%) | 19 (0%) | 93 (1%) | 180 (1%) | 48 (0%) | 19 (0%) | 60 (0%) | 111 (1%) | 22 (0%) | 152 (1%) | 790 (4%) | 210 (1%) | 47 (0%) | 18 (0%) | 56 (0%) | 84 (0%) | 25 (0%) | 137 (1%) | 789 (4%) | 115 (1%) | 11 (0%) | 138 (1%) | 346 (2%) | 62 (0%) | 531 (3%) | 17384 | 2952 |
| CCR5_site_8 | 11985 | 872 (7%) | 89 (1%) | 43 (0%) | 20 (0%) | 47 (0%) | 59 (0%) | 1504 (13%) | 91 (1%) | 25 (0%) | 37 (0%) | 19 (0%) | 29 (0%) | 22 (0%) | 63 (1%) | 31 (0%) | 19 (0%) | 5 (0%) | 17 (0%) | 15 (0%) | 20 (0%) | 18 (0%) | 38 (0%) | 22 (0%) | 18 (0%) | 8 (0%) | 5 (0%) | 34 (0%) | 19 (0%) | 10 (0%) | 20 (0%) | 17 (0%) | 7 (0%) | 18 (0%) | 106 (1%) | 22 (0%) | 18 (0%) | 24 (0%) | 55 (0%) | 12 (0%) | 16 (0%) | 12 (0%) | 6 (0%) | 56 (0%) | 29 (0%) | 15 (0%) | 12 (0%) | 21 (0%) | 14 (0%) | 22 (0%) | 83 (1%) | 30 (0%) | 20 (0%) | 16 (0%) | 7 (0%) | 15 (0%) | 6 (0%) | 19 (0%) | 144 (1%) | 33 (0%) | 16 (0%) | 23 (0%) | 67 (1%) | 20 (0%) | 70 (1%) | 964 (8%) | 134 (1%) | 39 (0%) | 40 (0%) | 34 (0%) | 76 (1%) | 28 (0%) | 16 (0%) | 20 (0%) | 20 (0%) | 57 (0%) | 31 (0%) | 30 (0%) | 29 (0%) | 24 (0%) | 15 (0%) | 33 (0%) | 82 (1%) | 26 (0%) | 18 (0%) | 14 (0%) | 22 (0%) | 9 (0%) | 5 (0%) | 14 (0%) | 135 (1%) | 31 (0%) | 17 (0%) | 19 (0%) | 94 (1%) | 24 (0%) | 70 (1%) | 1119 (9%) | 89 (1%) | 40 (0%) | 20 (0%) | 16 (0%) | 16 (0%) | 32 (0%) | 4 (0%) | 11 (0%) | 147 (1%) | 40 (0%) | 13 (0%) | 15 (0%) | 91 (1%) | 20 (0%) | 82 (1%) | 677 (6%) | 118 (1%) | 25 (0%) | 15 (0%) | 9 (0%) | 59 (0%) | 7 (0%) | 58 (0%) | 1266 (11%) | 127 (1%) | 36 (0%) | 85 (1%) | 711 (6%) | 34 (0%) | 563 (5%) | 17634 | 13058 |
| CTLA4_site_9 | 6814 | 363 (5%) | 73 (1%) | 27 (0%) | 26 (0%) | 26 (0%) | 74 (1%) | 900 (13%) | 16 (0%) | 34 (0%) | 38 (1%) | 16 (0%) | 19 (0%) | 13 (0%) | 22 (0%) | 11 (0%) | 21 (0%) | 19 (0%) | 10 (0%) | 31 (0%) | 26 (0%) | 16 (0%) | 8 (0%) | 4 (0%) | 16 (0%) | 2 (0%) | 3 (0%) | 44 (1%) | 11 (0%) | 3 (0%) | 3 (0%) | 23 (0%) | 11 (0%) | 17 (0%) | 85 (1%) | 26 (0%) | 15 (0%) | 32 (0%) | 20 (0%) | 12 (0%) | 16 (0%) | 12 (0%) | 8 (0%) | 22 (0%) | 24 (0%) | 12 (0%) | 6 (0%) | 32 (0%) | 4 (0%) | 15 (0%) | 59 (1%) | 32 (0%) | 10 (0%) | 6 (0%) | 4 (0%) | 22 (0%) | NA | 8 (0%) | 86 (1%) | 26 (0%) | 12 (0%) | 5 (0%) | 69 (1%) | 6 (0%) | 28 (0%) | 428 (6%) | 108 (2%) | 30 (0%) | 44 (1%) | 50 (1%) | 26 (0%) | 18 (0%) | 18 (0%) | 10 (0%) | 10 (0%) | 35 (1%) | 27 (0%) | 19 (0%) | 9 (0%) | 56 (1%) | 6 (0%) | 18 (0%) | 75 (1%) | 41 (1%) | 12 (0%) | 2 (0%) | 4 (0%) | 36 (1%) | NA | 10 (0%) | 92 (1%) | 36 (1%) | 2 (0%) | 7 (0%) | 64 (1%) | 7 (0%) | 18 (0%) | 503 (7%) | 85 (1%) | 42 (1%) | 23 (0%) | 15 (0%) | 10 (0%) | 27 (0%) | 16 (0%) | 6 (0%) | 114 (2%) | 32 (0%) | 2 (0%) | 8 (0%) | 97 (1%) | 8 (0%) | 42 (1%) | 413 (6%) | 73 (1%) | 21 (0%) | 6 (0%) | 10 (0%) | 76 (1%) | 13 (0%) | 27 (0%) | 430 (6%) | 97 (1%) | 14 (0%) | 30 (0%) | 443 (7%) | 28 (0%) | 145 (2%) | NA | NA |
| CXCR4_site_7 | 12029 | 783 (7%) | 353 (3%) | 66 (1%) | 22 (0%) | 10 (0%) | 4 (0%) | 411 (3%) | 13 (0%) | 8 (0%) | 34 (0%) | 30 (0%) | 29 (0%) | 10 (0%) | 32 (0%) | 32 (0%) | 10 (0%) | 29 (0%) | 30 (0%) | 45 (0%) | 10 (0%) | NA | 11 (0%) | 19 (0%) | 7 (0%) | 16 (0%) | 37 (0%) | 30 (0%) | 12 (0%) | 4 (0%) | 4 (0%) | 43 (0%) | 21 (0%) | 72 (1%) | 68 (1%) | 17 (0%) | 2 (0%) | NA | 2 (0%) | NA | 2 (0%) | NA | NA | NA | 2 (0%) | 2 (0%) | 4 (0%) | 9 (0%) | 2 (0%) | 5 (0%) | 55 (0%) | 4 (0%) | NA | NA | 2 (0%) | NA | NA | 5 (0%) | 38 (0%) | 10 (0%) | 1 (0%) | 4 (0%) | 28 (0%) | 6 (0%) | 47 (0%) | 4394 (37%) | 363 (3%) | 46 (0%) | 4 (0%) | 12 (0%) | 22 (0%) | 14 (0%) | 13 (0%) | 14 (0%) | 17 (0%) | 45 (0%) | 10 (0%) | 18 (0%) | 10 (0%) | 42 (0%) | 28 (0%) | 52 (0%) | 183 (2%) | 24 (0%) | 5 (0%) | 2 (0%) | 10 (0%) | 15 (0%) | 10 (0%) | 23 (0%) | 197 (2%) | 27 (0%) | 10 (0%) | 61 (1%) | 199 (2%) | 33 (0%) | 355 (3%) | 811 (7%) | 48 (0%) | 2 (0%) | NA | NA | NA | 6 (0%) | NA | 7 (0%) | 47 (0%) | 5 (0%) | 2 (0%) | 1 (0%) | 45 (0%) | 5 (0%) | 70 (1%) | 481 (4%) | 19 (0%) | NA | NA | 5 (0%) | 25 (0%) | NA | 20 (0%) | 396 (3%) | 17 (0%) | 2 (0%) | 25 (0%) | 381 (3%) | 33 (0%) | 766 (6%) | NA | NA |
| TRAC_site_1 | 8017 | 377 (5%) | 35 (0%) | 33 (0%) | 19 (0%) | 16 (0%) | 36 (0%) | 1102 (14%) | 62 (1%) | 14 (0%) | 44 (1%) | 24 (0%) | 29 (0%) | 24 (0%) | 66 (1%) | 18 (0%) | 10 (0%) | 30 (0%) | 26 (0%) | 21 (0%) | 5 (0%) | 15 (0%) | 26 (0%) | 10 (0%) | 4 (0%) | 14 (0%) | 14 (0%) | 24 (0%) | 4 (0%) | 12 (0%) | 11 (0%) | 14 (0%) | 10 (0%) | 19 (0%) | 80 (1%) | 34 (0%) | 23 (0%) | 10 (0%) | 46 (1%) | 33 (0%) | 16 (0%) | 18 (0%) | 23 (0%) | 41 (1%) | 14 (0%) | 30 (0%) | 43 (1%) | 25 (0%) | 33 (0%) | 32 (0%) | 60 (1%) | 14 (0%) | 2 (0%) | 20 (0%) | 16 (0%) | 8 (0%) | 20 (0%) | 16 (0%) | 85 (1%) | 27 (0%) | 15 (0%) | 18 (0%) | 59 (1%) | 31 (0%) | 98 (1%) | 389 (5%) | 84 (1%) | 12 (0%) | 10 (0%) | 20 (0%) | 44 (1%) | 26 (0%) | 20 (0%) | 24 (0%) | 17 (0%) | 27 (0%) | 12 (0%) | 22 (0%) | 26 (0%) | 24 (0%) | 23 (0%) | 27 (0%) | 57 (1%) | 10 (0%) | 4 (0%) | 14 (0%) | 16 (0%) | 14 (0%) | 8 (0%) | 28 (0%) | 52 (1%) | 18 (0%) | 19 (0%) | 24 (0%) | 75 (1%) | 12 (0%) | 80 (1%) | 551 (7%) | 74 (1%) | 39 (0%) | 16 (0%) | 44 (1%) | 20 (0%) | 22 (0%) | 12 (0%) | 37 (0%) | 148 (2%) | 32 (0%) | 30 (0%) | 70 (1%) | 73 (1%) | 45 (1%) | 109 (1%) | 379 (5%) | 48 (1%) | 18 (0%) | 20 (0%) | 17 (0%) | 43 (1%) | 22 (0%) | 92 (1%) | 483 (6%) | 69 (1%) | 31 (0%) | 89 (1%) | 387 (5%) | 83 (1%) | 518 (6%) | NA | NA |
NNN04m (NIST R1-2) & NNN05m (NIST R2)
(nnn04_nnn05_vt <- read_tsv(here("data-raw/tables/nnn04m_nnn05m_venn_table_df.tsv")))%>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site | target_site_total | NNN04m | NNN04m-NNN05m | NNN04m-NNN05m-NSN02m | NNN04m-NSN02m | NNN05m | NNN05m-NSN02m | NSN02m | max_on_target_readcount | max_off_target_readcount |
|---|---|---|---|---|---|---|---|---|---|---|
| A_CCR5_site_3_GM24385 | 4933 | 2186 (44%) | 131 (3%) | 1528 (31%) | 192 (4%) | 141 (3%) | 529 (11%) | 226 (5%) | 21000 | 3108 |
| E_CTLA4_site_9_GM24385 | 1218 | 463 (38%) | 20 (2%) | 323 (27%) | 25 (2%) | 67 (6%) | 238 (20%) | 82 (7%) | 8838 | 3836 |
| F_AAVS1_site_14_GM24385 | 1066 | 221 (21%) | 16 (2%) | 296 (28%) | 19 (2%) | 91 (9%) | 303 (28%) | 120 (11%) | 8808 | 1942 |
TOTAL # OFF-TARGET SITES & REPLICATE PLOTS
Perform additional analyses between all data sets to examine how sequencing depth, number of samples run, and other factors influence concordance of off-target sites.
## Locate files, set up metadata dataframe.
## CREATE METADATA DF
## Using message = FALSE to avoid printing read_tsv message about column specifications
file_list <- as.list(list.files(path = here("data-raw"),
pattern = "identified_matched.txt",
include.dirs = TRUE, full.names = TRUE,
recursive = TRUE))
file_names <- str_remove(file_list, paste0(here("data-raw"),"/"))
changeseq_lst <- set_names(file_list, file_names)
## make that into a data frame
changeseq_metadata_df <- tibble(changeseq_file = unlist(file_names)) %>%
separate(changeseq_file, c("lab","changeseq_run"),
sep = "/", remove = FALSE) %>%
mutate(target_site = str_remove(changeseq_run, "_identified_matched.txt"),
target_site = str_remove(target_site, "CRL[:digit:]*_"))
## READ IN FILES - CREATE 1 DF
changeseq_df <- changeseq_lst %>%
map_dfr(read_tsv, col_types = cols("Start" = col_double(), "End" = col_double(),
"Nuclease_Read_Count" = col_double(), "Control_Read_Count" = col_double(),
"Site_Substitution_Number" = col_double()),
.id = "changeseq_file")
## create small df to link lab with numbers of samples run on sequencer
lab_sample_df <- data.frame(
lab = character(),
num_samples_run = double(),
stringsAsFactors = FALSE
)
## enter lab names
lab_sample_df[1,1] <- "NNN01m"
lab_sample_df[2,1] <- "NNN02m"
lab_sample_df[3,1] <- "NNN03m"
lab_sample_df[4,1] <- "NNN04m"
lab_sample_df[5,1] <- "NNN05m"
lab_sample_df[6,1] <- "NSN01m"
lab_sample_df[7,1] <- "NSN02m"
lab_sample_df[8,1] <- "SSN01m"
lab_sample_df[9,1] <- "SSN02m"
## enter sample numbers
lab_sample_df[1,2] <- 7
lab_sample_df[2,2] <- 7
lab_sample_df[3,2] <- 4
lab_sample_df[4,2] <- 4
lab_sample_df[5,2] <- 4
lab_sample_df[6,2] <- NA
lab_sample_df[7,2] <- NA
lab_sample_df[8,2] <- NA
lab_sample_df[9,2] <- NA
## Annotating with sample metadata
changeseq_anno_df <- changeseq_df %>%
left_join(changeseq_metadata_df) %>%
mutate(changeseq_run = str_remove(changeseq_run, ".*/")) %>%
left_join(lab_sample_df)
#print("LABS INCLUDED IN THIS STUDY:")
#unique(changeseq_anno_df$lab)
#print("TARGET SITES INCLUDED IN THIS STUDY:")
#unique(changeseq_anno_df$target_site)
## color blind palette
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#999999",
"#20F03B","#FEC44F", "#D95F0E", "#756BB1", "#FF8A33", "#D7BE07", "#A807D7")NOTE The CHANGE-Seq manuscript performed analyses on these target sites across different DNA lines. For this analysis we pulled only GM24385 data from that dataset because that is the line the other samples here used. These manuscript samples are the SSNā samples.
## create summary dataframe
summary_df <- changeseq_anno_df %>%
## replace NAs in Site_Substitution_Number col
mutate(Site_Substitution_Number = if_else(is.na(Site_Substitution_Number), -1, Site_Substitution_Number),
## assign read count to on-target col of on-target row per sample
on_target = if_else(Site_Substitution_Number == 0, Nuclease_Read_Count, 0),
## assign off-target read counts in non-on-target rows
off_target = if_else(on_target == 0, Nuclease_Read_Count, 0))
## create df of all off-target rows sorted by decreasing read count value
off_target_df <- summary_df %>%
filter(!Site_Substitution_Number == 0) All Split-Sequencing Samples
Split-sequencing: Wet-lab sample preparation was performed by NIST and the sample was split evenly between NIST and St.Ā Jude to perform sequencing. All of these samples shown here were run through the CHANGE-seq bioinformatics pipeline by Sierra D. Miller. This includes the samples prepared by NIST in March of 2020 and later that fall (aka R2, clean replicate of AEF preparation).
PURPOSE: To investigate variability between replicates sequenced at different locations.
SAMPLES
[sample name: sequencing location: alt name]
- NNN01m: NIST: March 2020
- NSN01m: St.Ā Jude: March 2020
- NNN05m: NIST: R2
- NSN02m: St.Ā Jude: R2
Total Off-Target Sites
The total number of off-target sites (genomic coordinates) per sample per target site. Also noting the number of samples run on the sequencer at the time that sample was run.
## enter labs here to exclude from study -- will make it easier to switch between common chunks of code
lab_disregard <- c("NNN02m", "NNN03m", "NNN04m", "SSN01m", "SSN02m")
## count the number of total genommic coordinates for each target site for each lab
total_genomic_positions_1 <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
group_by(target_site, lab) %>%
## including the genomic coordinate col gave me a "Genomic Coordinate" col so excluding
count(target_site, lab, name = "total_number_of_sites") %>%
## verified numbers by counting rows of identified_matched.txt files for each target site/lab
arrange(target_site, lab) %>%
left_join(lab_sample_df)
ggplotly(ggplot(total_genomic_positions_1) +
geom_bar(aes(x = target_site, y = total_number_of_sites, fill = lab),
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "Target Site", y = "Total Number of Sites",
fill = "Lab") + scale_fill_manual(values=cbPalette))total_genomic_positions_1 %>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site | lab | total_number_of_sites | num_samples_run |
|---|---|---|---|
| A_CCR5_site_3 | NNN01m | 791 | 7 |
| A_CCR5_site_3 | NNN05m | 2329 | 4 |
| A_CCR5_site_3 | NSN01m | 937 | NA |
| A_CCR5_site_3 | NSN02m | 2475 | NA |
| E_CTLA4_site_9 | NNN01m | 315 | 7 |
| E_CTLA4_site_9 | NNN05m | 648 | 4 |
| E_CTLA4_site_9 | NSN01m | 346 | NA |
| E_CTLA4_site_9 | NSN02m | 668 | NA |
| F_AAVS1_site_14 | NNN01m | 285 | 7 |
| F_AAVS1_site_14 | NNN05m | 706 | 4 |
| F_AAVS1_site_14 | NSN01m | 308 | NA |
| F_AAVS1_site_14 | NSN02m | 738 | NA |
Overlapping Off-Target Sites Between Replicates & Read Count Comparison
Of the off-target sites that are found in both replicates, how comparable are their read count values?
March 2020 Samples
NNN01m vs.Ā NSN01m
lab_disregard <- c("NNN02m", "NNN03m", "NNN04m", "SSN01m", "SSN02m", "NSN02m", "NNN05m")
## identify sites that are found in both replicates
#################### DF CURATION ############################
## derivate of previous reports' variables' "counts_by_lab"
nnn01_nsn01_overlap_sites <- summary_df %>%
filter(!(lab %in% lab_disregard)) %>%
# Only including off target edits
filter(!Site_Substitution_Number == 0) %>%
## Excluding redundant and unnecessary columns
# Extra columns results in non-matching counts by lab
select("Genomic Coordinate", target_site, lab, Nuclease_Read_Count) %>%
pivot_wider(names_from = "lab",
values_from = "Nuclease_Read_Count",
values_fill = 0) %>%
## NNN0 vs. NNN1: Concordant Off-Target Coordinates
filter(NNN01m != 0, NSN01m != 0) %>%
rowwise() %>%
mutate(AVG = mean(c(NNN01m, NSN01m)),
SD = sd(c(NNN01m, NSN01m)),
CV = SD / AVG) %>%
arrange(desc(CV))
#nnn01_nsn01_overlap_sites %>%
# kbl() %>%
# kable_material(c("striped", "hover"))
nnn01_nsn01_overlap_sites_plot <- nnn01_nsn01_overlap_sites %>%
rename(genomic_coordinate = "Genomic Coordinate") %>%
select(genomic_coordinate, NNN01m, NSN01m, target_site)
# pivot_longer(cols = c("NNN01m", "NSN01m"), names_to = "lab", values_to = "read_count") %>%
# ungroup() %>%
# arrange("read_count")
#ggplotly(ggplot(data=nnn01_nsn01_overlap_sites_plot,aes(x=read_count,fill=lab)) +
# geom_bar(data=subset(nnn01_nsn01_overlap_sites_plot,lab=="NNN01m")) +
# geom_bar(data=subset(nnn01_nsn01_overlap_sites_plot,lab=="NSN01m"),aes(y=..count..*(-1))) +
# scale_y_continuous(breaks=seq(-40,40,10),labels=abs(seq(-40,40,10))) +
# coord_flip())
ggplotly(ggplot(nnn01_nsn01_overlap_sites_plot) +
geom_point(aes(x = NNN01m, y = NSN01m), alpha = 0.25) +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~target_site, ncol = 1) +
theme_bw() +
labs(x = "NNN01m Read Counts",
y = "NSN01m Read Counts"))R2 Samples
NNN05m vs.Ā NSN02m
lab_disregard <- c("NNN02m", "NNN03m", "NNN04m", "SSN01m", "SSN02m", "NNN01m", "NSN01m")
nnn05_nsn02_overlap_sites <- summary_df %>%
filter(!(lab %in% lab_disregard)) %>%
# Only including off target edits
filter(!Site_Substitution_Number == 0) %>%
## Excluding redundant and unnecessary columns
# Extra columns results in non-matching counts by lab
select("Genomic Coordinate", target_site, lab, Nuclease_Read_Count) %>%
pivot_wider(names_from = "lab",
values_from = "Nuclease_Read_Count",
values_fill = 0) %>%
## Concordant Off-Target Coordinates
filter(NNN05m != 0, NSN02m != 0) %>%
rowwise() %>%
mutate(AVG = mean(c(NNN05m, NSN02m)),
SD = sd(c(NNN05m, NSN02m)),
CV = SD / AVG) %>%
arrange(desc(CV))
#nnn05_nsn02_overlap_sites %>%
# kbl() %>%
# kable_material(c("striped", "hover"))
nnn05_nsn02_overlap_sites_plot <- nnn05_nsn02_overlap_sites %>%
rename(genomic_coordinate = "Genomic Coordinate") %>%
select(genomic_coordinate, NNN05m, NSN02m, target_site)
ggplotly(ggplot(nnn05_nsn02_overlap_sites_plot) +
geom_point(aes(x = NNN05m, y = NSN02m), alpha = 0.25) +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~target_site, ncol = 1) +
theme_bw() +
labs(x = "NNN05m Read Counts",
y = "NSN02m Read Counts"))#(nnn05_nsn02_overlap_sites)Re-Sampling
These samples were prepared once by NIST and sampled twice to perform sequencing.
PURPOSE: To investigate variability between replicates sequenced at the same site from the same library at different times.
NNN03m vs.Ā NNN04m
Total Off-Target Sites
lab_disregard <- c("NNN02m", "NNN01m", "NNN05m", "NSN01m", "NSN02m", "SSN01m", "SSN02m")
## count the number of total genommic coordinates for each target site for each lab
total_genomic_positions_2 <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
group_by(target_site, lab) %>%
## including the genomic coordinate col gave me a "Genomic Coordinate" col so excluding
count(target_site, lab, name = "total_number_of_sites") %>%
## verified numbers by counting rows of identified_matched.txt files for each target site/lab
arrange(target_site, lab) %>%
left_join(lab_sample_df)
ggplotly(ggplot(total_genomic_positions_2) +
geom_bar(aes(x = target_site, y = total_number_of_sites, fill = lab),
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "Target Site", y = "Total Number of Sites",
fill = "Lab") + scale_fill_manual(values=cbPalette))total_genomic_positions_2 %>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site | lab | total_number_of_sites | num_samples_run |
|---|---|---|---|
| A_CCR5_site_3 | NNN03m | 2350 | 4 |
| A_CCR5_site_3 | NNN04m | 4037 | 4 |
| E_CTLA4_site_9 | NNN03m | 640 | 4 |
| E_CTLA4_site_9 | NNN04m | 831 | 4 |
| F_AAVS1_site_14 | NNN03m | 541 | 4 |
| F_AAVS1_site_14 | NNN04m | 552 | 4 |
Overlapping Off-Target Sites Between Replicates & Read Count Comparison
Of the sites that are found in both replicates, how comparable are their read count values?
lab_disregard <- c("NNN02m", "NNN01m", "NNN05m", "NSN01m", "NSN02m", "SSN01m", "SSN02m")
## identify sites that are found in both replicates
#################### DF CURATION ############################
## derivate of previous reports' variables' "counts_by_lab"
nnn03_nnn04_overlap_sites <- summary_df %>%
filter(!(lab %in% lab_disregard)) %>%
# Only including off target edits
filter(!Site_Substitution_Number == 0) %>%
## Excluding redundant and unnecessary columns
# Extra columns results in non-matching counts by lab
select("Genomic Coordinate", target_site, lab, Nuclease_Read_Count) %>%
pivot_wider(names_from = "lab",
values_from = "Nuclease_Read_Count",
values_fill = 0) %>%
## NNN0 vs. NNN1: Concordant Off-Target Coordinates
filter(NNN03m != 0, NNN04m != 0) %>%
rowwise() %>%
mutate(AVG = mean(c(NNN03m, NNN04m)),
SD = sd(c(NNN03m, NNN04m)),
CV = SD / AVG) %>%
arrange(desc(CV))
#nnn03_nnn04_overlap_sites %>%
# kbl() %>%
# kable_material(c("striped", "hover"))
nnn03_nnn04_overlap_sites_plot <- nnn03_nnn04_overlap_sites %>%
rename(genomic_coordinate = "Genomic Coordinate") %>%
select(genomic_coordinate, NNN03m, NNN04m, target_site)
ggplotly(ggplot(nnn03_nnn04_overlap_sites_plot) +
geom_point(aes(x = NNN03m, y = NNN04m), alpha = 0.25) +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~target_site, ncol = 1) +
theme_bw() +
labs(x = "NNN03m Read Counts",
y = "NNN04m Read Counts"))#(nnn03_nnn04_overlap_sites)NOTE AG camera failure for NNN03m resulted in data loss.
NIST Clean Replicates (AEF)
In the summer-fall 2020, clean replicates were prepared by NIST for target sites A, E, and F.
NNN04m vs.NNN05m
Total Off-Target Sites
lab_disregard <- c("NNN02m", "NNN01m", "NNN03m", "NSN01m", "NSN02m", "SSN01m", "SSN02m")
## count the number of total genommic coordinates for each target site for each lab
total_genomic_positions_3 <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
group_by(target_site, lab) %>%
## including the genomic coordinate col gave me a "Genomic Coordinate" col so excluding
count(target_site, lab, name = "total_number_of_sites") %>%
## verified numbers by counting rows of identified_matched.txt files for each target site/lab
arrange(target_site, lab) %>%
left_join(lab_sample_df)
ggplotly(ggplot(total_genomic_positions_3) +
geom_bar(aes(x = target_site, y = total_number_of_sites, fill = lab),
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "Target Site", y = "Total Number of Sites",
fill = "Lab") + scale_fill_manual(values=cbPalette))total_genomic_positions_3 %>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site | lab | total_number_of_sites | num_samples_run |
|---|---|---|---|
| A_CCR5_site_3 | NNN04m | 4037 | 4 |
| A_CCR5_site_3 | NNN05m | 2329 | 4 |
| E_CTLA4_site_9 | NNN04m | 831 | 4 |
| E_CTLA4_site_9 | NNN05m | 648 | 4 |
| F_AAVS1_site_14 | NNN04m | 552 | 4 |
| F_AAVS1_site_14 | NNN05m | 706 | 4 |
Overlapping Off-Target Sites Between Replicates & Read Count Comparison
Of the sites that are found in both replicates, how comparable are their read count values?
lab_disregard <- c("NNN02m", "NNN01m", "NNN03m", "NSN01m", "NSN02m", "SSN01m", "SSN02m")
## identify sites that are found in both replicates
#################### DF CURATION ############################
## derivate of previous reports' variables' "counts_by_lab"
nnn05_nnn04_overlap_sites <- summary_df %>%
filter(!(lab %in% lab_disregard)) %>%
# Only including off target edits
filter(!Site_Substitution_Number == 0) %>%
## Excluding redundant and unnecessary columns
# Extra columns results in non-matching counts by lab
select("Genomic Coordinate", target_site, lab, Nuclease_Read_Count) %>%
pivot_wider(names_from = "lab",
values_from = "Nuclease_Read_Count",
values_fill = 0) %>%
## NNN0 vs. NNN1: Concordant Off-Target Coordinates
filter(NNN05m != 0, NNN04m != 0) %>%
rowwise() %>%
mutate(AVG = mean(c(NNN05m, NNN04m)),
SD = sd(c(NNN05m, NNN04m)),
CV = SD / AVG) %>%
arrange(desc(CV))
#nnn05_nnn04_overlap_sites %>%
# kbl() %>%
# kable_material(c("striped", "hover"))
nnn05_nnn04_overlap_sites_plot <- nnn05_nnn04_overlap_sites %>%
rename(genomic_coordinate = "Genomic Coordinate") %>%
select(genomic_coordinate, NNN04m, NNN05m, target_site)
ggplotly(ggplot(nnn05_nnn04_overlap_sites_plot) +
geom_point(aes(x = NNN04m, y = NNN05m), alpha = 0.25) +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~target_site, ncol = 1) +
theme_bw() +
labs(x = "NNN04m Read Counts",
y = "NNN05m Read Counts"))#(nnn05_nnn04_overlap_sites)SEQUENCING DEPTH COMPARISONS
PURPOSE: To investigate whether off-target sites replicate between samples that have higher/lower sequencing depths.
Of the sites that do not replicate between R1-2 (NNN04m) v R2 (NNN05m; present in one sample but not the other) are they found in the CHANGE-Seq manuscript sample GM24385 (replicates 1 and 2: SSN01m and SSN02m) where the sequencing depth was much more?
lab_disregard <- c("NNN02m", "NNN01m", "NNN03m", "NSN01m", "NSN02m", "SSN01m", "SSN02m")
## create df that marks which gen coords are shared between labs
nnn04m_nnn05m_off_target_lab_set <- off_target_df %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
filter(!(lab %in% lab_disregard)) %>%
group_by(`Genomic Coordinate`, target_site) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
nnn04m_nnn05m_venn_df <- off_target_df %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
filter(!(lab %in% lab_disregard))%>%
pivot_wider(names_from = "lab", values_from = "Nuclease_Read_Count", values_fill = NA) %>%
left_join(nnn04m_nnn05m_off_target_lab_set)
## gather coords that are present in only one lab - either one
nnn04_nnn05_discordants <- nnn04m_nnn05m_venn_df %>%
filter( is.na(NNN04m) | is.na(NNN05m)) %>%
# filter(target_site == "E_CTLA4_site_9") %>%
pivot_longer(names_to = "lab", cols = c("NNN04m", "NNN05m"), values_to = "Nuclease_Read_Count") %>%
pivot_wider(names_from = lab, values_from = Nuclease_Read_Count) %>%
select(-lab_set) %>%
rename(Genomic_Coordinate = 'Genomic Coordinate')
## gather the list of genomic coordinates for the two replicates of GM24385 manuscript samples
lab_disregard <- c("NNN02m", "NNN01m", "NNN03m", "NNN04m", "NNN05m", "NSN02m", "NSN01m")
dna_lines_coords <- changeseq_anno_df %>%
filter(!(lab %in% lab_disregard)) %>%
# filter(target_site == "E_CTLA4_site_9") %>%
select(`Genomic Coordinate`, Nuclease_Read_Count, lab, target_site) %>%
pivot_wider(names_from = lab, values_from = Nuclease_Read_Count) %>%
rename(Genomic_Coordinate = 'Genomic Coordinate')
## intersect with the nnn04/05 discordants df.
## take genomic coords in the nnn04/nnn05 list and check to see if same gen coord is in the dna lines list
expanded_coords <- inner_join(nnn04_nnn05_discordants, dna_lines_coords, by = "Genomic_Coordinate")
## get info for printout messages below
total_low_disc <- nrow(nnn04_nnn05_discordants)
total_rep_coords <- nrow(expanded_coords)
both_rep_concord <- expanded_coords %>%
filter(!is.na(SSN01m), !is.na(SSN02m))
total_both_rep_coords <- nrow(both_rep_concord)
one_rep_concord <- expanded_coords %>%
filter(is.na(SSN01m) | is.na(SSN02m))
one_rep_concordant <- nrow(one_rep_concord)
#print("Total number of discordant sites in the lower sequencing depth samples:")
#(total_low_disc)
#print("Total number of discordant sites in lower sequencing depth samples that replicate in one or both of the deeper sequencing samples:")
#(total_rep_coords)
#print("Total number of discordant sites in lower sequencing depth samples that replicate in only one of the deeper sequencing samples:")
#(one_rep_concordant)
#print("Number of discordant sites in lower sequencing depth samples that replicate in both replicates of the deeper sequencing samples:")
#(total_both_rep_coords)
## sanity check
either_concord <- expanded_coords %>%
filter(is.na(SSN01m) | is.na(SSN02m))
either_concord_row <- nrow(either_concord)St.Ā Jude-Sequenced Samples
Off-target sites for all samples that were sequenced at St.Ā Jude (manuscript samples + NIST prepared samples) are compared.
PURPOSE: Constant variable = sequencing location. Experimental variables: Time point, sequencing depth, wet-lab preparation.
SAMPLES
NSN01m: St.Ā Jude sequencing of the NIST-prepared March 2020 sample. NSN02m: St.Ā Jude sequencing of the NIST-prepared R2 (summer/fall 2020) sample SSN01m: CHANGE-Seq manuscript, GM24385, replicate 2 SSN02m: CHANGE-Seq manuscript, GM24385, replicate 2
## str_c joins character vectors via collapse flag.
## by unique lab sorting we are joining labs into unique combinations
## every represented lab set for each target site and genomic coordinate is present
lab_disregard <- c("NNN02m", "NNN01m", "NNN03m", "NNN04m", "NNN05m")
off_target_lab_set <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
# filter(target_site == "E_CTLA4_site_9") %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
group_by(`Genomic Coordinate`, target_site) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
venn_df <- off_target_df %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
filter(!(lab %in% lab_disregard)) %>%
# filter(target_site == "E_CTLA4_site_9") %>%
pivot_wider(names_from = "lab", values_from = "Nuclease_Read_Count", values_fill = NA) %>%
left_join(off_target_lab_set)
ggplotly(ggplot(data = venn_df) +
geom_bar(aes(fct_rev(as_factor(target_site)), fill = lab_set),
# position = "fill"
position = "fill"
) +
coord_flip() +
theme_bw() +
labs(x = "Target Site",
y = "Proportion of Sites",
fill = "Lab Combinations") + scale_fill_manual(values=cbPalette))####### tables
## count sites in lab-sets
(num_sites <- off_target_lab_set %>%
group_by(target_site) %>%
count(lab_set)) %>%
rename("total_off-target_sites" = n) %>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site | lab_set | total_off-target_sites |
|---|---|---|
| A_CCR5_site_3 | NSN01m | 75 |
| A_CCR5_site_3 | NSN01m-NSN02m | 10 |
| A_CCR5_site_3 | NSN01m-NSN02m-SSN01m | 12 |
| A_CCR5_site_3 | NSN01m-NSN02m-SSN01m-SSN02m | 697 |
| A_CCR5_site_3 | NSN01m-NSN02m-SSN02m | 12 |
| A_CCR5_site_3 | NSN01m-SSN01m | 44 |
| A_CCR5_site_3 | NSN01m-SSN01m-SSN02m | 57 |
| A_CCR5_site_3 | NSN01m-SSN02m | 30 |
| A_CCR5_site_3 | NSN02m | 539 |
| A_CCR5_site_3 | NSN02m-SSN01m | 221 |
| A_CCR5_site_3 | NSN02m-SSN01m-SSN02m | 768 |
| A_CCR5_site_3 | NSN02m-SSN02m | 216 |
| A_CCR5_site_3 | SSN01m | 1469 |
| A_CCR5_site_3 | SSN01m-SSN02m | 703 |
| A_CCR5_site_3 | SSN02m | 1224 |
| E_CTLA4_site_9 | NSN01m | 45 |
| E_CTLA4_site_9 | NSN01m-NSN02m | 2 |
| E_CTLA4_site_9 | NSN01m-NSN02m-SSN01m | 3 |
| E_CTLA4_site_9 | NSN01m-NSN02m-SSN01m-SSN02m | 247 |
| E_CTLA4_site_9 | NSN01m-NSN02m-SSN02m | 1 |
| E_CTLA4_site_9 | NSN01m-SSN01m | 10 |
| E_CTLA4_site_9 | NSN01m-SSN01m-SSN02m | 31 |
| E_CTLA4_site_9 | NSN01m-SSN02m | 7 |
| E_CTLA4_site_9 | NSN02m | 226 |
| E_CTLA4_site_9 | NSN02m-SSN01m | 43 |
| E_CTLA4_site_9 | NSN02m-SSN01m-SSN02m | 102 |
| E_CTLA4_site_9 | NSN02m-SSN02m | 44 |
| E_CTLA4_site_9 | SSN01m | 515 |
| E_CTLA4_site_9 | SSN01m-SSN02m | 182 |
| E_CTLA4_site_9 | SSN02m | 474 |
| F_AAVS1_site_14 | NSN01m | 15 |
| F_AAVS1_site_14 | NSN01m-NSN02m | 2 |
| F_AAVS1_site_14 | NSN01m-NSN02m-SSN01m | 3 |
| F_AAVS1_site_14 | NSN01m-NSN02m-SSN01m-SSN02m | 240 |
| F_AAVS1_site_14 | NSN01m-NSN02m-SSN02m | 10 |
| F_AAVS1_site_14 | NSN01m-SSN01m | 5 |
| F_AAVS1_site_14 | NSN01m-SSN01m-SSN02m | 31 |
| F_AAVS1_site_14 | NSN01m-SSN02m | 2 |
| F_AAVS1_site_14 | NSN02m | 237 |
| F_AAVS1_site_14 | NSN02m-SSN01m | 34 |
| F_AAVS1_site_14 | NSN02m-SSN01m-SSN02m | 132 |
| F_AAVS1_site_14 | NSN02m-SSN02m | 80 |
| F_AAVS1_site_14 | SSN01m | 209 |
| F_AAVS1_site_14 | SSN01m-SSN02m | 210 |
| F_AAVS1_site_14 | SSN02m | 732 |
## for number of sites each bin = total # times particular combination shows up
## read counts
(read_range_df <- venn_df %>%
pivot_longer(cols = c("NSN01m", "NSN02m", "SSN01m", "SSN02m"),
names_to = "sample", values_to = "read_counts", values_drop_na = TRUE) %>%
group_by(target_site, lab_set) %>%
summarise(min_read_count = min(read_counts),
max_read_count = max(read_counts))) %>%
rename(target_site = target_site)%>%
kbl() %>%
kable_material(c("striped", "hover")) %>%
kable_styling(font_size = 8)| target_site | lab_set | min_read_count | max_read_count |
|---|---|---|---|
| A_CCR5_site_3 | NSN01m | 6 | 20 |
| A_CCR5_site_3 | NSN01m-NSN02m | 6 | 32 |
| A_CCR5_site_3 | NSN01m-NSN02m-SSN01m | 6 | 114 |
| A_CCR5_site_3 | NSN01m-NSN02m-SSN01m-SSN02m | 6 | 5964 |
| A_CCR5_site_3 | NSN01m-NSN02m-SSN02m | 6 | 42 |
| A_CCR5_site_3 | NSN01m-SSN01m | 6 | 76 |
| A_CCR5_site_3 | NSN01m-SSN01m-SSN02m | 6 | 292 |
| A_CCR5_site_3 | NSN01m-SSN02m | 6 | 34 |
| A_CCR5_site_3 | NSN02m | 6 | 32 |
| A_CCR5_site_3 | NSN02m-SSN01m | 6 | 100 |
| A_CCR5_site_3 | NSN02m-SSN01m-SSN02m | 6 | 1302 |
| A_CCR5_site_3 | NSN02m-SSN02m | 6 | 88 |
| A_CCR5_site_3 | SSN01m | 6 | 120 |
| A_CCR5_site_3 | SSN01m-SSN02m | 6 | 254 |
| A_CCR5_site_3 | SSN02m | 6 | 98 |
| E_CTLA4_site_9 | NSN01m | 6 | 12 |
| E_CTLA4_site_9 | NSN01m-NSN02m | 6 | 22 |
| E_CTLA4_site_9 | NSN01m-NSN02m-SSN01m | 6 | 80 |
| E_CTLA4_site_9 | NSN01m-NSN02m-SSN01m-SSN02m | 6 | 8090 |
| E_CTLA4_site_9 | NSN01m-NSN02m-SSN02m | 6 | 14 |
| E_CTLA4_site_9 | NSN01m-SSN01m | 6 | 50 |
| E_CTLA4_site_9 | NSN01m-SSN01m-SSN02m | 6 | 164 |
| E_CTLA4_site_9 | NSN01m-SSN02m | 6 | 28 |
| E_CTLA4_site_9 | NSN02m | 6 | 36 |
| E_CTLA4_site_9 | NSN02m-SSN01m | 6 | 46 |
| E_CTLA4_site_9 | NSN02m-SSN01m-SSN02m | 6 | 428 |
| E_CTLA4_site_9 | NSN02m-SSN02m | 6 | 50 |
| E_CTLA4_site_9 | SSN01m | 6 | 42 |
| E_CTLA4_site_9 | SSN01m-SSN02m | 6 | 118 |
| E_CTLA4_site_9 | SSN02m | 6 | 52 |
| F_AAVS1_site_14 | NSN01m | 6 | 8 |
| F_AAVS1_site_14 | NSN01m-NSN02m | 6 | 14 |
| F_AAVS1_site_14 | NSN01m-NSN02m-SSN01m | 6 | 20 |
| F_AAVS1_site_14 | NSN01m-NSN02m-SSN01m-SSN02m | 6 | 8024 |
| F_AAVS1_site_14 | NSN01m-NSN02m-SSN02m | 6 | 132 |
| F_AAVS1_site_14 | NSN01m-SSN01m | 6 | 18 |
| F_AAVS1_site_14 | NSN01m-SSN01m-SSN02m | 6 | 442 |
| F_AAVS1_site_14 | NSN01m-SSN02m | 6 | 96 |
| F_AAVS1_site_14 | NSN02m | 6 | 38 |
| F_AAVS1_site_14 | NSN02m-SSN01m | 6 | 36 |
| F_AAVS1_site_14 | NSN02m-SSN01m-SSN02m | 6 | 672 |
| F_AAVS1_site_14 | NSN02m-SSN02m | 6 | 104 |
| F_AAVS1_site_14 | SSN01m | 6 | 36 |
| F_AAVS1_site_14 | SSN01m-SSN02m | 6 | 452 |
| F_AAVS1_site_14 | SSN02m | 6 | 134 |
Discordant Off-Target Sites: lower-sequenced NIST samples vs.Ā higher-sequenced St.Ā Jude samples
NIST-prepared and -sequenced samples (NNN04m: R1-2 & NNN05m: R2) were sequenced at a lower depth. The discordant off-target sites between these samples were pooled and labeled ālow-seqā.
The two replicates from the CHANGE-Seq GM24385 (for target sites A, E, F) were labeled separately: high_seq_rep1 and high_seq_rep2, accordingly.
PURPOSE: Are the discordant off-target sites between lower-sequenced samples recovered when sequencing depth is expanded?
## curate the lower sequencing depth df
low_seq_disc <- nnn04_nnn05_discordants %>%
pivot_longer(cols = c("NNN04m", "NNN05m"), names_to = "lab", values_to = "low_seq_discordant") %>%
filter(!is.na(low_seq_discordant)) %>%
select(Genomic_Coordinate, target_site, low_seq_discordant)
## low_seq_discordant = read count value of either NNN04m or NNN05m
## combine the low_seq_disc sites with the sites for the manuscript samples
low_seq_disc_both_reps_df <- left_join(low_seq_disc, dna_lines_coords, by = "Genomic_Coordinate") %>%
rename(high_seq_rep1 = "SSN01m", high_seq_rep2 = "SSN02m")
## create venn df's
low_seq_lab_set <- low_seq_disc_both_reps_df %>%
select(Genomic_Coordinate, target_site.x, low_seq_discordant, high_seq_rep1, high_seq_rep2) %>%
# mutate_all(~replace(., is.na(.), 0)) %>%
pivot_longer(cols = c("low_seq_discordant", "high_seq_rep1", "high_seq_rep2"),
names_to = "lab", values_to = "Nuclease_Read_Count", values_drop_na = TRUE) %>%
select(Genomic_Coordinate, lab, Nuclease_Read_Count, target_site.x) %>%
group_by(Genomic_Coordinate, target_site.x) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
low_seq_venn_df <- low_seq_disc_both_reps_df %>%
select(Genomic_Coordinate, low_seq_discordant, high_seq_rep1, high_seq_rep2, target_site.x) %>%
left_join(low_seq_lab_set, by = "Genomic_Coordinate")
## plot
ggplotly(ggplot(data = low_seq_venn_df) +
geom_bar(aes(fct_rev(as_factor(target_site.x.x)), fill = lab_set),
position = "fill"
) +
coord_flip() +
theme_bw() +
labs(x = "Target Site",
y = "Proportion of Sites",
fill = "Sample IDs") + scale_fill_manual(values=cbPalette))## for number of sites each bin = total # times particular combination shows up
## count sites in lab-sets
(low_high_num_sites <- low_seq_lab_set %>%
group_by(target_site.x) %>%
count(lab_set)) %>%
rename("total_off-target_sites" = n) %>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site.x | lab_set | total_off-target_sites |
|---|---|---|
| A_CCR5_site_3 | high_seq_rep1-high_seq_rep2-low_seq_discordant | 617 |
| A_CCR5_site_3 | high_seq_rep1-low_seq_discordant | 490 |
| A_CCR5_site_3 | high_seq_rep2-low_seq_discordant | 442 |
| A_CCR5_site_3 | low_seq_discordant | 1499 |
| E_CTLA4_site_9 | high_seq_rep1-high_seq_rep2-low_seq_discordant | 124 |
| E_CTLA4_site_9 | high_seq_rep1-low_seq_discordant | 87 |
| E_CTLA4_site_9 | high_seq_rep2-low_seq_discordant | 87 |
| E_CTLA4_site_9 | low_seq_discordant | 495 |
| F_AAVS1_site_14 | high_seq_rep1-high_seq_rep2-low_seq_discordant | 160 |
| F_AAVS1_site_14 | high_seq_rep1-low_seq_discordant | 37 |
| F_AAVS1_site_14 | high_seq_rep2-low_seq_discordant | 101 |
| F_AAVS1_site_14 | low_seq_discordant | 336 |
## for number of sites each bin = total # times particular combination shows up
## read counts
(low_high_read_range_df <- low_seq_venn_df %>%
pivot_longer(cols = c("low_seq_discordant", "high_seq_rep1", "high_seq_rep2"),
names_to = "sample", values_to = "read_counts", values_drop_na = TRUE) %>%
group_by(target_site.x.x, lab_set) %>%
summarise(min_read_count = min(read_counts),
max_read_count = max(read_counts))) %>%
rename(target_site = target_site.x.x)%>%
kbl() %>%
kable_material(c("striped", "hover")) %>%
kable_styling(font_size = 8)| target_site | lab_set | min_read_count | max_read_count |
|---|---|---|---|
| A_CCR5_site_3 | high_seq_rep1-high_seq_rep2-low_seq_discordant | 6 | 354 |
| A_CCR5_site_3 | high_seq_rep1-low_seq_discordant | 6 | 146 |
| A_CCR5_site_3 | high_seq_rep2-low_seq_discordant | 6 | 218 |
| A_CCR5_site_3 | low_seq_discordant | 6 | 116 |
| E_CTLA4_site_9 | high_seq_rep1-high_seq_rep2-low_seq_discordant | 6 | 986 |
| E_CTLA4_site_9 | high_seq_rep1-low_seq_discordant | 6 | 80 |
| E_CTLA4_site_9 | high_seq_rep2-low_seq_discordant | 6 | 52 |
| E_CTLA4_site_9 | low_seq_discordant | 6 | 52 |
| F_AAVS1_site_14 | high_seq_rep1-high_seq_rep2-low_seq_discordant | 6 | 534 |
| F_AAVS1_site_14 | high_seq_rep1-low_seq_discordant | 6 | 94 |
| F_AAVS1_site_14 | high_seq_rep2-low_seq_discordant | 6 | 134 |
| F_AAVS1_site_14 | low_seq_discordant | 6 | 96 |
All Off-Target Sites: NIST R1-2, R2 vs.Ā CHANGE-Seq Manuscript GM24385 Rep1,Rep2
Venn bar plot between NNN04m (NIST R1-2), NNN05m (R2) and the 2 replicates of the Manuscript GM24385.
lab_disregard <- c("NNN01m", "NNN02m", "NNN03m", "NSN01m", "NSN02m")
full_ven <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
group_by(`Genomic Coordinate`, target_site) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
## plot
ggplotly(ggplot(data = full_ven) +
geom_bar(aes(x = target_site, fill = lab_set),
# position = "fill"
position = "fill"
) +
coord_flip() +
theme_bw() +
labs(x = "Target Site",
y = "Proportion of Sites",
fill = "Lab Combinations") + scale_fill_manual(values=cbPalette) +
theme(axis.text.y = element_text(angle = 90)))##----------------
## all data venn bar
##----------------
## count sites in lab-sets
(full_venn_num_sites <- full_ven %>%
group_by(target_site, lab_set) %>%
count(lab_set)) %>%
rename("total_off-target_sites" = n) %>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site | lab_set | total_off-target_sites |
|---|---|---|
| A_CCR5_site_3 | NNN04m | 1129 |
| A_CCR5_site_3 | NNN04m-NNN05m | 132 |
| A_CCR5_site_3 | NNN04m-NNN05m-SSN01m | 114 |
| A_CCR5_site_3 | NNN04m-NNN05m-SSN01m-SSN02m | 1320 |
| A_CCR5_site_3 | NNN04m-NNN05m-SSN02m | 93 |
| A_CCR5_site_3 | NNN04m-SSN01m | 392 |
| A_CCR5_site_3 | NNN04m-SSN01m-SSN02m | 517 |
| A_CCR5_site_3 | NNN04m-SSN02m | 340 |
| A_CCR5_site_3 | NNN05m | 370 |
| A_CCR5_site_3 | NNN05m-SSN01m | 98 |
| A_CCR5_site_3 | NNN05m-SSN01m-SSN02m | 100 |
| A_CCR5_site_3 | NNN05m-SSN02m | 102 |
| A_CCR5_site_3 | SSN01m | 1142 |
| A_CCR5_site_3 | SSN01m-SSN02m | 288 |
| A_CCR5_site_3 | SSN02m | 947 |
| E_CTLA4_site_9 | NNN04m | 288 |
| E_CTLA4_site_9 | NNN04m-NNN05m | 13 |
| E_CTLA4_site_9 | NNN04m-NNN05m-SSN01m | 14 |
| E_CTLA4_site_9 | NNN04m-NNN05m-SSN01m-SSN02m | 303 |
| E_CTLA4_site_9 | NNN04m-NNN05m-SSN02m | 13 |
| E_CTLA4_site_9 | NNN04m-SSN01m | 60 |
| E_CTLA4_site_9 | NNN04m-SSN01m-SSN02m | 84 |
| E_CTLA4_site_9 | NNN04m-SSN02m | 56 |
| E_CTLA4_site_9 | NNN05m | 207 |
| E_CTLA4_site_9 | NNN05m-SSN01m | 27 |
| E_CTLA4_site_9 | NNN05m-SSN01m-SSN02m | 40 |
| E_CTLA4_site_9 | NNN05m-SSN02m | 31 |
| E_CTLA4_site_9 | SSN01m | 470 |
| E_CTLA4_site_9 | SSN01m-SSN02m | 135 |
| E_CTLA4_site_9 | SSN02m | 426 |
| F_AAVS1_site_14 | NNN04m | 138 |
| F_AAVS1_site_14 | NNN04m-NNN05m | 12 |
| F_AAVS1_site_14 | NNN04m-NNN05m-SSN01m | 5 |
| F_AAVS1_site_14 | NNN04m-NNN05m-SSN01m-SSN02m | 270 |
| F_AAVS1_site_14 | NNN04m-NNN05m-SSN02m | 25 |
| F_AAVS1_site_14 | NNN04m-SSN01m | 11 |
| F_AAVS1_site_14 | NNN04m-SSN01m-SSN02m | 55 |
| F_AAVS1_site_14 | NNN04m-SSN02m | 36 |
| F_AAVS1_site_14 | NNN05m | 198 |
| F_AAVS1_site_14 | NNN05m-SSN01m | 26 |
| F_AAVS1_site_14 | NNN05m-SSN01m-SSN02m | 105 |
| F_AAVS1_site_14 | NNN05m-SSN02m | 65 |
| F_AAVS1_site_14 | SSN01m | 209 |
| F_AAVS1_site_14 | SSN01m-SSN02m | 183 |
| F_AAVS1_site_14 | SSN02m | 698 |
## read counts
lab_disregard <- c("NNN01m", "NNN02m", "NNN03m", "NSN01m", "NSN02m")
## rename col to perform left join
full_ven <- full_ven %>%
rename(Genomic_Coordinate = 'Genomic Coordinate')
## get read counts and lab set in one df
full_venn_df <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
rename(Genomic_Coordinate = 'Genomic Coordinate') %>%
left_join(full_ven, by = "Genomic_Coordinate")
## get read count range for lab sets
(full_venn_read_range_df <- full_venn_df %>%
group_by(target_site.x, lab_set) %>%
summarise(min_read_count = min(Nuclease_Read_Count),
max_read_count = max(Nuclease_Read_Count))) %>%
kbl() %>%
kable_material(c("striped", "hover"))| target_site.x | lab_set | min_read_count | max_read_count |
|---|---|---|---|
| A_CCR5_site_3 | NNN04m | 6 | 116 |
| A_CCR5_site_3 | NNN04m-NNN05m | 6 | 218 |
| A_CCR5_site_3 | NNN04m-NNN05m-SSN01m | 6 | 148 |
| A_CCR5_site_3 | NNN04m-NNN05m-SSN01m-SSN02m | 6 | 5964 |
| A_CCR5_site_3 | NNN04m-NNN05m-SSN02m | 6 | 162 |
| A_CCR5_site_3 | NNN04m-SSN01m | 6 | 146 |
| A_CCR5_site_3 | NNN04m-SSN01m-SSN02m | 6 | 354 |
| A_CCR5_site_3 | NNN04m-SSN02m | 6 | 218 |
| A_CCR5_site_3 | NNN05m | 6 | 36 |
| A_CCR5_site_3 | NNN05m-SSN01m | 6 | 80 |
| A_CCR5_site_3 | NNN05m-SSN01m-SSN02m | 6 | 120 |
| A_CCR5_site_3 | NNN05m-SSN02m | 6 | 44 |
| A_CCR5_site_3 | SSN01m | 6 | 108 |
| A_CCR5_site_3 | SSN01m-SSN02m | 6 | 100 |
| A_CCR5_site_3 | SSN02m | 6 | 98 |
| E_CTLA4_site_9 | NNN04m | 6 | 52 |
| E_CTLA4_site_9 | NNN04m-NNN05m | 6 | 44 |
| E_CTLA4_site_9 | NNN04m-NNN05m-SSN01m | 6 | 46 |
| E_CTLA4_site_9 | NNN04m-NNN05m-SSN01m-SSN02m | 6 | 8090 |
| E_CTLA4_site_9 | NNN04m-NNN05m-SSN02m | 6 | 52 |
| E_CTLA4_site_9 | NNN04m-SSN01m | 6 | 50 |
| E_CTLA4_site_9 | NNN04m-SSN01m-SSN02m | 6 | 986 |
| E_CTLA4_site_9 | NNN04m-SSN02m | 6 | 52 |
| E_CTLA4_site_9 | NNN05m | 6 | 32 |
| E_CTLA4_site_9 | NNN05m-SSN01m | 6 | 80 |
| E_CTLA4_site_9 | NNN05m-SSN01m-SSN02m | 6 | 96 |
| E_CTLA4_site_9 | NNN05m-SSN02m | 6 | 46 |
| E_CTLA4_site_9 | SSN01m | 6 | 42 |
| E_CTLA4_site_9 | SSN01m-SSN02m | 6 | 86 |
| E_CTLA4_site_9 | SSN02m | 6 | 34 |
| F_AAVS1_site_14 | NNN04m | 6 | 96 |
| F_AAVS1_site_14 | NNN04m-NNN05m | 6 | 50 |
| F_AAVS1_site_14 | NNN04m-NNN05m-SSN01m | 6 | 50 |
| F_AAVS1_site_14 | NNN04m-NNN05m-SSN01m-SSN02m | 6 | 8024 |
| F_AAVS1_site_14 | NNN04m-NNN05m-SSN02m | 6 | 132 |
| F_AAVS1_site_14 | NNN04m-SSN01m | 6 | 94 |
| F_AAVS1_site_14 | NNN04m-SSN01m-SSN02m | 6 | 262 |
| F_AAVS1_site_14 | NNN04m-SSN02m | 6 | 134 |
| F_AAVS1_site_14 | NNN05m | 6 | 38 |
| F_AAVS1_site_14 | NNN05m-SSN01m | 6 | 36 |
| F_AAVS1_site_14 | NNN05m-SSN01m-SSN02m | 6 | 534 |
| F_AAVS1_site_14 | NNN05m-SSN02m | 6 | 104 |
| F_AAVS1_site_14 | SSN01m | 6 | 36 |
| F_AAVS1_site_14 | SSN01m-SSN02m | 6 | 280 |
| F_AAVS1_site_14 | SSN02m | 6 | 134 |
temp <- full_venn_df %>% filter(lab_set == "NNN04m-SSN01m-SSN02m")
# write.table(temp, file="~/Desktop/temp.tsv", sep="\t")Denisty Plots
Target Site A: A_CCR5_site_3
SAMPLES
- NNN04m: NIST sequencing of R1-2
- NNN05m: NIST sequencing of R2
- SSN01m: CHANGE-Seq Manuscript GM24385 Rep1
- SSN02m: CHANGE-Seq Manuscript GM24385 Rep1
lab_disregard <- c("NNN01m", "NNN02m", "NNN03m", "NSN01m", "NSN02m")
A_lab_set <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
filter(target_site == "A_CCR5_site_3") %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
group_by(`Genomic Coordinate`) %>%
summarise(lab_set = str_c(sort(lab), collapse = "-"))
A_rep_count_scatter <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
filter(target_site == "A_CCR5_site_3") %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
right_join(A_lab_set)
## need 31 colors for plot
cbPalette2 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#999999",
"#20F03B","#FEC44F", "#D95F0E", "#756BB1", "#FF8A33", "#D7BE07", "#A807D7", #16
"#E9967A", "#CD5C5C", "#DFFF00", "#DE3163", "#9FE2BF", "#40E0D0", "#CCCCFF", #23
"#800000", "#808000", "#008080", "#800080", "#C5CAE9", "#616161", "#827717", "#6D4C41") #31
ggplotly(ggplot(A_rep_count_scatter) +
geom_histogram(aes(x = Nuclease_Read_Count, fill = lab_set), position = "fill") +
scale_x_log10() +
facet_wrap(~target_site, scales = "free_y", ncol = 1) +
theme_bw() +
scale_fill_brewer(type = "qual", palette = 1) +
labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations") + scale_fill_manual(values=cbPalette2))## number of sites that are above a 100 read count value
A_abov_100 <- A_rep_count_scatter %>%
filter(lab_set == "NNN04m-NNN05m-SSN01m-SSN02m") %>%
filter(Nuclease_Read_Count > 99) %>%
select('Genomic Coordinate') %>%
unique()
A_abov100_min_reads <- A_rep_count_scatter %>%
pivot_wider(names_from = "lab", values_from = "Nuclease_Read_Count", values_fill = NA) %>%
mutate(min_read_count = pmin(NNN04m, NNN05m, SSN01m, SSN02m, na.rm=TRUE)) %>%
filter(min_read_count > 99) %>%
select('Genomic Coordinate') %>%
unique()
# NNN04m-NNN05m-SSN01m-SSN02m
labs <- c("NNN04m", "NNN05m", "SSN01m", "SSN02m")
A_disc_above_100 <- A_rep_count_scatter %>%
filter(lab_set %in% labs) %>%
filter(Nuclease_Read_Count > 99) %>%
select('Genomic Coordinate') %>%
unique()Target Site E: E_CTLA4_site_9
Samples
- NNN04m: NIST sequencing of R1-2
- NNN05m: NIST sequencing of R2
- SSN01m: CHANGE-Seq Manuscript GM24385 Rep1
- SSN02m: CHANGE-Seq Manuscript GM24385 Rep1
lab_disregard <- c("NNN01m", "NNN02m", "NNN03m", "NSN01m", "NSN02m")
E_lab_set <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
filter(target_site == "E_CTLA4_site_9") %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
group_by(`Genomic Coordinate`) %>%
summarise(lab_set = str_c(sort(lab), collapse = "-"))
E_rep_count_scatter <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
filter(target_site == "E_CTLA4_site_9") %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
right_join(E_lab_set)
## need 31 colors for plot
cbPalette2 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#999999",
"#20F03B","#FEC44F", "#D95F0E", "#756BB1", "#FF8A33", "#D7BE07", "#A807D7", #16
"#E9967A", "#CD5C5C", "#DFFF00", "#DE3163", "#9FE2BF", "#40E0D0", "#CCCCFF", #23
"#800000", "#808000", "#008080", "#800080", "#C5CAE9", "#616161", "#827717", "#6D4C41") #31
ggplotly(ggplot(E_rep_count_scatter) +
geom_histogram(aes(x = Nuclease_Read_Count, fill = lab_set), position = "fill") +
scale_x_log10() +
facet_wrap(~target_site, scales = "free_y", ncol = 1) +
theme_bw() +
scale_fill_brewer(type = "qual", palette = 1) +
labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations") + scale_fill_manual(values=cbPalette2))E_abov_100 <- E_rep_count_scatter %>%
filter(lab_set == "NNN04m-NNN05m-SSN01m-SSN02m") %>%
filter(Nuclease_Read_Count > 99) %>%
select('Genomic Coordinate') %>%
unique()
E_abov100_min_reads <- E_rep_count_scatter %>%
pivot_wider(names_from = "lab", values_from = "Nuclease_Read_Count", values_fill = NA) %>%
mutate(min_read_count = pmin(NNN04m, NNN05m, SSN01m, SSN02m, na.rm=TRUE)) %>%
filter(min_read_count > 99) %>%
select('Genomic Coordinate') %>%
unique()
labs <- c("NNN04m", "NNN05m", "SSN01m", "SSN02m")
E_disc_above_100 <- E_rep_count_scatter %>%
filter(lab_set %in% labs) %>%
filter(Nuclease_Read_Count > 99) %>%
select('Genomic Coordinate') %>%
unique()Target Site F: F_AAVS1_site_14
Samples
- NNN04m: NIST sequencing of R1-2
- NNN05m: NIST sequencing of R2
- SSN01m: CHANGE-Seq Manuscript GM24385 Rep1
- SSN02m: CHANGE-Seq Manuscript GM24385 Rep1
lab_disregard <- c("NNN01m", "NNN02m", "NNN03m", "NSN01m", "NSN02m")
F_lab_set <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
filter(target_site == "F_AAVS1_site_14") %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
group_by(`Genomic Coordinate`) %>%
summarise(lab_set = str_c(sort(lab), collapse = "-"))
F_rep_count_scatter <- off_target_df %>%
filter(!(lab %in% lab_disregard)) %>%
filter(target_site == "F_AAVS1_site_14") %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
right_join(F_lab_set)
## need 31 colors for plot
cbPalette2 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#999999",
"#20F03B","#FEC44F", "#D95F0E", "#756BB1", "#FF8A33", "#D7BE07", "#A807D7", #16
"#E9967A", "#CD5C5C", "#DFFF00", "#DE3163", "#9FE2BF", "#40E0D0", "#CCCCFF", #23
"#800000", "#808000", "#008080", "#800080", "#C5CAE9", "#616161", "#827717", "#6D4C41") #31
ggplotly(ggplot(F_rep_count_scatter) +
geom_histogram(aes(x = Nuclease_Read_Count, fill = lab_set), position = "fill") +
scale_x_log10() +
facet_wrap(~target_site, scales = "free_y", ncol = 1) +
theme_bw() +
scale_fill_brewer(type = "qual", palette = 1) +
labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations") + scale_fill_manual(values=cbPalette2))F_abov_100 <- F_rep_count_scatter %>%
filter(lab_set == "NNN04m-NNN05m-SSN01m-SSN02m") %>%
filter(Nuclease_Read_Count > 99) %>%
select('Genomic Coordinate') %>%
unique()
F_abov100_min_reads <- F_rep_count_scatter %>%
pivot_wider(names_from = "lab", values_from = "Nuclease_Read_Count", values_fill = NA) %>%
mutate(min_read_count = pmin(NNN04m, NNN05m, SSN01m, SSN02m, na.rm=TRUE)) %>%
filter(min_read_count > 99) %>%
select('Genomic Coordinate') %>%
unique()
labs <- c("NNN04m", "NNN05m", "SSN01m", "SSN02m")
F_disc_above_100 <- F_rep_count_scatter %>%
filter(lab_set %in% labs) %>%
filter(Nuclease_Read_Count > 99) %>%
select('Genomic Coordinate') %>%
unique()NNN05m vs.Ā NSN02m
labs <- c("NNN05m", "NSN02m")
nnn05m_nsn02m_lab_set <- off_target_df %>%
filter(lab %in% labs) %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
group_by(`Genomic Coordinate`, target_site) %>%
summarise(lab_set = str_c(sort(lab), collapse = "-"))## `summarise()` has grouped output by 'Genomic Coordinate'. You can override using the `.groups` argument.
nnn05m_nsn02m_rep_count_scatter_df <- off_target_df %>%
filter(lab %in% labs) %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
right_join(nnn05m_nsn02m_lab_set) %>%
count(target_site, lab_set, Nuclease_Read_Count, name = "Occurrence")## Joining, by = c("Genomic Coordinate", "target_site")
ggplotly(ggplot(nnn05m_nsn02m_rep_count_scatter_df) +
geom_histogram(aes(x = Nuclease_Read_Count, fill = lab_set), position = "fill") +
scale_x_log10() +
facet_wrap(~target_site, scales = "free_y", ncol = 1) +
theme_bw() +
scale_fill_brewer(type = "qual", palette = 1) +
labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations") + scale_fill_manual(values=cbPalette2))## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Concordance Plots
R1-1 vs.Ā CHANGE-Seq Manuscript Replicates
Replicates 1 and 2 are combined and termed āManuscriptā. When there are concordant read count values for a given off-target site, the lower read count value is absorbed.
## combine replicate 1 and 2 of change-seq manuscript
lab_disregard <- c("NNN01m", "NSN01m", "NNN02m", "NNN04m", "NNN05m", "NSN02m", "NNN03m")
# combine replicates for the changeseq manuscript samples.
# for each target site, those that are concordant, take the lowest read count
manuscript_df <- off_target_df %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
filter(!(lab %in% lab_disregard)) %>%
pivot_wider(names_from = "lab", values_from = "Nuclease_Read_Count", values_fill = NA) %>%
mutate(min_read_count = pmin(SSN01m, SSN02m, na.rm=TRUE)) %>%
select(`Genomic Coordinate`, target_site, min_read_count) %>%
rename(Manuscript = min_read_count)
## create df with the 3 sample off-targets, combine replicates from manuscript
lab_disregard <- c("NNN01m", "NSN01m", "NNN02m", "NNN04m", "NNN05m", "NSN02m", "SSN01m", "SSN02m")
root_df <- off_target_df %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
filter(!(lab %in% lab_disregard)) %>%
pivot_wider(names_from = "lab", values_from = "Nuclease_Read_Count", values_fill = NA) %>%
full_join(manuscript_df) %>%
pivot_longer(names_to = "lab", values_to = "Nuclease_Read_Count", cols = c("NNN03m","Manuscript")) %>%
drop_na
lab_set <- root_df %>%
select(`Genomic Coordinate`, target_site, lab, Nuclease_Read_Count) %>%
group_by(`Genomic Coordinate`, target_site) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
concordance_df <- root_df %>%
pivot_wider(names_from = "lab", values_from = "Nuclease_Read_Count", values_fill = NA) %>%
left_join(lab_set) %>%
mutate(min_read_count = pmin(NNN03m, Manuscript, na.rm=TRUE)) %>%
mutate(concordance = if_else(str_detect(lab_set, "-",),
"concordant", "discordant"))
rep_count_occurrence <- concordance_df %>%
select('Genomic Coordinate', target_site, lab_set, min_read_count) %>%
count(target_site, lab_set, min_read_count, name = "occurrence")
occurrence_concordance_df <- rep_count_occurrence %>%
mutate(concordance = if_else(str_detect(lab_set, "-",),
"concordant", "discordant")) %>%
select(target_site, occurrence, -lab_set, concordance) %>%
group_by(target_site, concordance) %>%
summarise(total_occurrence = sum(occurrence)) %>%
pivot_wider(names_from = concordance,
values_from = total_occurrence,
values_fill = NA) %>%
rowwise() %>%
mutate(total_sites = sum(concordant, discordant))
## df: extend upon previous df to add total % concordant & discordant
total_concordance_wide_df <- occurrence_concordance_df %>%
mutate(total_pct_concordant = 100 * concordant / (concordant + discordant),
total_pct_discordant = 100 * discordant / (concordant + discordant)) %>%
mutate_if(is.numeric, ~round(., 1)) %>%
rename(total_concordant = concordant,
total_discordant = discordant)
pct_concordance_df <- rep_count_occurrence %>%
pivot_wider(names_from = lab_set, values_from = occurrence, values_fill = 0) %>%
group_by(target_site) %>%
arrange(min_read_count) %>%
left_join(total_concordance_wide_df) %>%
mutate(pct_concordant = (total_concordant-(lag(cumsum(`Manuscript-NNN03m`), default = 0))) / total_concordant,
pct_discordant = (total_discordant- (lag(cumsum(Manuscript + NNN03m), default = 0))) / total_discordant)
disc_sites_df <- pct_concordance_df %>%
mutate(concordant_sites = round(total_concordant * pct_concordant),
discordant_sites = round(total_discordant * pct_discordant)) Target Site: CCR5 site 3 (āAā)
A_disc_sites_df <- disc_sites_df %>%
filter(target_site == "A_CCR5_site_3") %>%
rename(concordant = concordant_sites,
discordant = discordant_sites) %>%
pivot_longer(names_to = "concordance", cols = c("concordant", "discordant"), values_to = "sites")
ggplotly(ggplot(A_disc_sites_df) +
geom_bar(aes(x = min_read_count, y = sites, fill = concordance),
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "Read Count", y = "Number of Sites",
fill = "Concordance"))Target Site: CTLA4 site 9 (āEā)
F_disc_sites_df <- disc_sites_df %>%
filter(target_site == "F_AAVS1_site_14") %>%
rename(concordant = concordant_sites,
discordant = discordant_sites) %>%
pivot_longer(names_to = "concordance", cols = c("concordant", "discordant"), values_to = "sites")
ggplotly(ggplot(F_disc_sites_df) +
geom_bar(aes(x = min_read_count, y = sites, fill = concordance),
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "Read Count", y = "Number of Sites",
fill = "Concordance"))Target Site: AAVS1 site 14 (āFā)
F_disc_sites_df <- disc_sites_df %>%
filter(target_site == "F_AAVS1_site_14") %>%
rename(concordant = concordant_sites,
discordant = discordant_sites) %>%
pivot_longer(names_to = "concordance", cols = c("concordant", "discordant"), values_to = "sites")
ggplotly(ggplot(F_disc_sites_df) +
geom_bar(aes(x = min_read_count, y = sites, fill = concordance),
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "Read Count", y = "Number of Sites",
fill = "Concordance"))